Hive复杂数据类型

array

array_contains(column, ‘value’)

hive_array.txt

1 2	pk beijing,shanghai,tianjin,hangzhou jepson changchu,chengdu,wuhan,beijing

create table hive_array(name string, work_locations array<string>)
row format delimited
fields terminated by '\t'  #字段之间的分隔符
COLLECTION ITEMS TERMINATED BY ','; # 数组的分隔符

#导入
LOAD DATA LOCAL INPATH '/home/hadoop/data/hive_array.txt' into table hive_array;

#查询 work_locations 第一个参数
select name,work_locations[0] from hive_array;

#查询 work_locations 的size
select name,size(work_locations) from hive_array;

#求在 tianjin
select * from hive_array where array_contains(work_locations,'tianjin') ;

MAP

MAP<primitive_type, data_type> kv
hive_map.txt

1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelababy,26

create table hive_map(
id int, name string, members map<string,string>, age int
)
row format delimited
fields terminated by ','   #字段之间的分隔符
COLLECTION ITEMS TERMINATED BY '#'  #参数之间的分隔符
MAP KEYS TERMINATED BY ':'    #key value之间的分隔符
;

#导入
LOAD DATA LOCAL INPATH '/home/hadoop/data/hive_map.txt' into table hive_map;

#取value
select id,name,members['father'] as father,members['mother'] as mother from hive_map;

#取key
select id,name,map_keys(members) as relations,age from hive_map;

#取key
select id,name,map_values(members) as relations,age from hive_map;

#size
select id,name,size(members) from hive_map;

struct

structs: STRUCT<col_name : data_type [COMMENT col_comment], …>
内部的数据类型可以不同

192.168.1.1#zhangsan:40

hive_struct.txt

192.168.1.1#zhangsan:40
192.168.1.2#lisi:50
192.168.1.3#wangwu:60
192.168.1.4#zhaoliu:70

create table hive_struct(
ip string, info struct<name:string,age:int>
)
row format delimited
fields terminated by '#'
COLLECTION ITEMS TERMINATED BY ':'  #struct的分隔符是':'(这句语句也可以用于array)
;

#导入
LOAD DATA LOCAL INPATH '/home/hadoop/data/hive_struct.txt' into table hive_struct;

#取出
select ip,info.name,info.age from hive_struct;

练习

click_log.txt

11	ad_101	2014-05-01 06:01:12.334+01
22	ad_102	2014-05-01 07:28:12.342+01
33	ad_103	2014-05-01 07:50:12.33+01
11	ad_104	2014-05-01 09:27:12.33+01
22	ad_103	2014-05-01 09:03:12.324+01
33	ad_102	2014-05-02 19:10:12.343+01
11	ad_101	2014-05-02 09:07:12.344+01
35	ad_105	2014-05-03 11:07:12.339+01
22	ad_104	2014-05-03 12:59:12.743+01
77	ad_103	2014-05-03 18:04:12.355+01
99	ad_102	2014-05-04 00:36:39.713+01
33	ad_101	2014-05-04 19:10:12.343+01
11	ad_101	2014-05-05 09:07:12.344+01
35	ad_102	2014-05-05 11:07:12.339+01
22	ad_103	2014-05-05 12:59:12.743+01
77	ad_104	2014-05-05 18:04:12.355+01
99	ad_105	2014-05-05 20:36:39.713+01

ad_list.txt

ad_101	http://www.google.com	catalog8|catalog1
ad_102	http://www.sohu.com	catalog6|catalog3
ad_103	http://www.baidu.com	catalog7
ad_104	http://www.qq.com	catalog5|catalog1|catalog4|catalog9
ad_105	http://sina.com

create table ad_list(ad_id string,url string,catalogs string)row format delimited fields terminated by '\t';	

人 访问的所有ad_id 
去重	
select cookie_id, collect_set(ad_id) from click_log group by cookie_id;	
不去重
select cookie_id, collect_set(ad_id) from click_log group by cookie_id;	


select
click.cookie_id,click.ad_id,click.amount,ad_list.catalogs
from
(select cookie_id, ad_id, count(1) amount from click_log group by cookie_id, ad_id) click
join ad_list
on ad_list.ad_id = click.ad_id;

针对ad_list.txt 中的catalogs列转行
ad_101  catalog8|catalog1
==>
ad_101 catalog8
ad_101 catalog1

select ad_id, catalog from ad_list lateral view outer explode(split(catalogs,'\\|')) t as catalog;

注意：\\ 转义字符
lateral view outer explode(split(catalogs,'\\|')) t as catalog


将catalogs字段转换为数组可以对其内部排序
create table ad_list_2(ad_id string,url string,catalogs array<string>)row format delimited fields terminated by '\t' COLLECTION ITEMS TERMINATED BY '|';	
select ad_id,sort_array(catalogs) from ad_list_2;

[Hive] Hive之复杂类型

Hive复杂数据类型(array<T>、MAP、struct)、练习

Hive复杂数据类型

array

MAP

struct

练习

FEATURED TAGS